################################################################################-
# Replication File for Wratil, Wäckerle and Proksch: Government Rhetoric and the 
# Representation of Public Opinion in International Negotiations
#
# This script creates the dfm used for all topic models (in the paper and the 
# appendices). It does all preprocessing steps, including lemmatization, removal 
# of stopwords, and splitting each text into three parts.
#
# There are no figures or tables created by this script.
################################################################################-

#### Set Up and Load Data ####

library(textstem)              #version:0.1.4
library(devtools)              #version:2.4.4
#devtools::install_github("kbenoit/quanteda.dictionaries") 
library(quanteda.dictionaries) #version 0.31
library(quanteda)              #version 3.2.1
library(tidyverse)             #version 1.3.2
library(lexicon)               #version 1.2.1

# Load the prepared corpus. The rest of this script works on the object
# `council_all_nopres`, which is expected to come from this file.
load(file = "generated_data/corpus_final.RData")

# Add a document variable `id_actor` of the form "<Transcription>_<Actor>".
council_all_nopres$id_actor <- paste0(
  council_all_nopres$Transcription,
  "_",
  council_all_nopres$Actor
)

# Split speeches in three parts
#
# Each speech is split on single spaces and rebuilt with a "##" marker in front
# of the first 20%, the middle 60% and the last 20% of its words. The markers
# are what the later corpus_segment() call cuts on.
#
# NOTE(review): the three index ranges share their end points, so the word at
# `first_marker` ends up in parts 1 and 2 and the word at `second_marker` in
# parts 2 and 3. For speeches of at most two words `first_marker` is 0 and
# `1:first_marker` is c(1, 0), which selects the first word. Both behaviours
# are kept as they are so that the replication output does not change.
for (doc_index in seq_len(ndoc(council_all_nopres))) {
  words <- str_split(council_all_nopres[[doc_index]], " ")[[1]]
  n_words <- length(words)
  first_marker <- round(0.2 * n_words)
  second_marker <- round(0.8 * n_words)
  marked_words <- c(
    "##", words[1:first_marker],
    "##", words[first_marker:second_marker],
    "##", words[second_marker:n_words]
  )
  council_all_nopres[[doc_index]] <- paste(marked_words, collapse = " ")
}

# Sanity check: tabulate how many "##" occurrences each speech now contains.
# Anything other than 3 means the speech already contained "##" on its own.
table(str_count(council_all_nopres, "##"))

# Cut every speech into segments at the tokens matching the glob "##*", i.e.
# at the "##" markers inserted by the splitting loop earlier in this script.
corp_seg2 <- corpus_segment(council_all_nopres, pattern = "##*")

# The labels below are assigned purely by position, so every speech must have
# produced exactly three consecutive segments. Fail loudly instead of relying
# on silent recycling, which would mislabel segments if a speech yielded
# fewer or more than three.
stopifnot(all(table(docid(corp_seg2)) == 3L))
corp_seg2$part_of_speech <- rep(
  c("1", "2", "3"),
  length.out = ndoc(corp_seg2)
)

# Keep an untouched copy of each segment's text as a document variable before
# the texts themselves are overwritten by later preprocessing.
corp_seg2$text_original <- as.character(corp_seg2)

# Lemmatize every segment word by word using the lexicon::hash_lemmas
# dictionary and write the result back into the corpus texts.
#
# The texts are extracted once up front (instead of converting the whole
# corpus to character on every iteration) and each segment is split on single
# spaces, lemmatized and re-joined with single spaces.
raw_segments <- as.character(corp_seg2)
lemmatized_segments <- vapply(
  str_split(raw_segments, " "),
  function(words) {
    paste(
      lemmatize_words(words, dictionary = lexicon::hash_lemmas),
      collapse = " "
    )
  },
  character(1),
  USE.NAMES = FALSE
)
corp_seg2[seq_along(lemmatized_segments)] <- lemmatized_segments

# Interactive look-ups of individual quotes.
#
# `find_segments_with()` reads the document variables of `corp_seg2`, keeps the
# rows whose `text_original` (the segment text stored before lemmatization)
# matches `text_to_search`, and returns the columns text_original, Actor,
# Transcription and date_correct. The search string is passed to grepl() as
# is, so it is interpreted as a regular expression.
find_segments_with <- function(text_to_search) {
  segment_info <- docvars(corp_seg2)
  matching_rows <- filter(
    segment_info,
    grepl(text_to_search, text_original)
  )
  select(matching_rows, text_original, Actor, Transcription, date_correct)
}

text_to_search <- "endorse the general approach on the"
find_segments_with(text_to_search)

text_to_search <- "breakthrough so far is due to the work"
find_segments_with(text_to_search)

text_to_search <- "poor financial management"
find_segments_with(text_to_search)

text_to_search <- "come back to this issue under the Greek presidency"
find_segments_with(text_to_search)

text_to_search <- "introduction nothing is agreed until everything is agreed"
find_segments_with(text_to_search)

# Build the token object used for the document-feature matrix.

# 1. Tokenize, dropping numbers, punctuation, symbols, separators and URLs.
data_tokens <- quanteda::tokens(
  corp_seg2,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

# 2. Replace tokens found in the US-to-UK dictionary by their dictionary key;
#    tokens without a match are kept (exclusive = FALSE) and keys are not
#    upper-cased (capkeys = FALSE).
data_tokens <- quanteda::tokens_lookup(
  data_tokens,
  data_dictionary_us2uk,
  exclusive = FALSE,
  capkeys = FALSE
)

# 3. Form unigrams and bigrams, then lower-case everything.
data_tokens <- tokens_ngrams(data_tokens, n = c(1, 2))
data_tokens <- tokens_tolower(data_tokens)

# 4. Remove English stopwords plus a few extra function words. This runs after
#    the n-grams were formed; bigrams that start or end with one of these
#    words are dealt with later in the script, on the dfm columns.
data_tokens <- tokens_remove(
  data_tokens,
  c(
    stopwords("english"),
    "may", "shall", "can",
    "must", "upon", "with", "without"
  )
)

# 5. Keep only tokens of at least three characters.
data_tokens <- tokens_select(data_tokens, min_nchar = 3)


# Document-feature matrix: keep features that occur at least 10 times overall
# and in at least 5 documents.
data_dfm <- dfm(data_tokens) %>%
  dfm_trim(min_termfreq = 10, min_docfreq = 5)

# Drop every feature that ends in "_<stopword>" or starts with "<stopword>_",
# i.e. bigrams whose first or second word is a stopword.
# The prefix/suffix vectors are built once rather than once per column.
stop_terms <- c(
  stopwords("english"),
  "may", "shall", "can",
  "must", "upon", "with", "without"
)
stop_suffixes <- paste0("_", stop_terms)
stop_prefixes <- paste0(stop_terms, "_")

has_stopword_edge <- vapply(
  colnames(data_dfm),
  function(feature) {
    any(endsWith(feature, stop_suffixes)) ||
      any(startsWith(feature, stop_prefixes))
  },
  logical(1),
  USE.NAMES = FALSE
)
to_delete <- which(has_stopword_edge)

# Negative indexing with an empty index vector would select zero columns and
# wipe the whole matrix, so only subset when there is something to remove.
if (length(to_delete) > 0) {
  data_dfm <- data_dfm[, -to_delete]
}

# Store the current (lemmatized) segment text alongside each dfm row.
data_dfm$text_copy <- as.character(corp_seg2)

# Remove segments that have no remaining features, from both the dfm and the
# corpus, then save both objects.
#
# The corpus is subset using the row positions of the dfm, which is only valid
# while both objects hold the same documents in the same order; check at least
# that the document counts agree before relying on it.
stopifnot(ndoc(data_dfm) == ndoc(corp_seg2))

texts.to.keep <- which(rowSums(data_dfm) > 0)

data_dfm <- data_dfm[texts.to.keep, ]

# Running position of each segment, stored as a document variable so that
# corpus_subset() can filter on it.
corp_seg2$id <- seq_len(ndoc(corp_seg2))
corp_seg2 <- corpus_subset(corp_seg2, id %in% texts.to.keep)

save(corp_seg2, file = "generated_data/corpus_for_final_analysis.RData")
save(data_dfm, file = "generated_data/dfm_for_stm.RData")

